pacman::p_load(dplyr, readr, tidyr, ggplot2, sf, tmap)

folder = "C:/models/silo/capetown/cape_town_fabilut/silo/"


jobStartDistribution = read_csv(paste(folder, "input/jobStartTimeDistributions.csv",sep  =""))
## Parsed with column specification:
## cols(
##   time = col_double(),
##   Agri = col_double(),
##   Mnft = col_double(),
##   Util = col_double(),
##   Cons = col_double(),
##   Retl = col_double(),
##   Trns = col_double(),
##   Finc = col_double(),
##   Rlst = col_double(),
##   Admn = col_double(),
##   Serv = col_double()
## )
ggplot(data  = jobStartDistribution) + 
  geom_line(inherit.aes = F, aes(x = time/3600, y = Mnft/max(jobStartDistribution$Mnft)), color = "red", size = 1) + 
  geom_line(inherit.aes = F, aes(x = time/3600, y = Admn/max(jobStartDistribution$Admn)), color = "blue", size = 1) + 
  ylab("Rel. frequency (red = manufacturing, blue = services)")

jobDurationDistribution = read_csv(paste(folder, "input/jobDurationDistributions.csv",sep  =""))
## Parsed with column specification:
## cols(
##   time = col_double(),
##   Agri = col_double(),
##   Mnft = col_double(),
##   Util = col_double(),
##   Cons = col_double(),
##   Retl = col_double(),
##   Trns = col_double(),
##   Finc = col_double(),
##   Rlst = col_double(),
##   Admn = col_double(),
##   Serv = col_double()
## )
ggplot(data  = jobDurationDistribution) + 
  geom_line(inherit.aes = F, aes(x = time/3600, y = Mnft/max(jobDurationDistribution$Mnft)), color = "red", size = 1) + 
  geom_line(inherit.aes = F, aes(x = time/3600, y = Admn/max(jobDurationDistribution$Admn)), color = "blue", size = 1) + 
  ylab("Rel. frequency (red = manufacturing, blue = services)")

#Job start and duration: the data is obtained from Mobilität in Deutshcland, based on trip diaries (trip to work). 


tripFrequencyDistribution = read_csv(paste(folder, "input/hts_work_tripLengthFrequencyDistribution.csv",sep  =""))
## Parsed with column specification:
## cols(
##   TravelTime = col_double(),
##   frequency = col_double(),
##   utility = col_double()
## )
ggplot(data  = tripFrequencyDistribution) + 
  geom_line(inherit.aes = F, aes(x = TravelTime, y = utility), color = "black", size = 1) + 
  geom_line(inherit.aes = F, aes(x = TravelTime, y = exp(-0.2 * TravelTime)), color = "orange", size = 1) + 
  ylab("Rel. frequency (red = manufacturing, blue = services)")

#The trip freq. distribution controls the penalty of longer commutes. In the current setting we have the black line, copied from Munich long time ago. The black
# line reproduces the observed travel time distribution. We found afterwards that the black line can lead to too long commute times in the successive years. If this 
# is observed in SILO-Capetown, it could be a good idea to replace this file by an exponential function, such as the orange line = exp(-0.2 * time_minutes).
#This can be simply done by replacing the column utility of this file accordingly. Has to be checked in the changes in commute distance by year.I selected the -0.2 
# value for Munich with a zero-growth scenario, assuming that commute distance has to remain relatively stable. 

#Crime index unfortunately not assigned. All subplaces (region) have same value, equal to 0.5. 
#School quality index unfortunately not assigned. All subplaces (region) have same value, equal to 0.9.

zones = read_csv(paste(folder, "input/zoneSystem.csv",sep  =""))
## Parsed with column specification:
## cols(
##   Zone = col_double(),
##   Area = col_double(),
##   JUR_NAME = col_character(),
##   ID_county = col_double(),
##   ID_city = col_double(),
##   Region = col_double()
## )
development = read_csv(paste(folder, "input/development.csv", sep  =""))
## Parsed with column specification:
## cols(
##   Zone = col_double(),
##   FORMAL = col_double(),
##   SEMIDETACHED = col_double(),
##   MULTIFAMILY = col_double(),
##   BACKYARD_INFORMAL = col_double(),
##   INFORMAL = col_double(),
##   DevCapacity = col_double(),
##   DevLandUse = col_double()
## )
#the last two columns define the development capacity. They are expressed in acres (column DevLandUse). Unless specified, the column DevCapacity is not used, and 
# would define the capacity in dwellings. 
#The previos colums (dummy variables per dd type) simply define wether it is allowed or not to develop a dwelling of the type. They are all "1". 

shp = st_read(paste(folder, "input/zonesShapeFile/zones.shp",sep  =""))
## Reading layer `zones' from data source `C:\models\silo\capetown\cape_town_fabilut\silo\input\zonesShapeFile\zones.shp' using driver `ESRI Shapefile'
## Simple feature collection with 2980 features and 32 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -62801.14 ymin: -3795547 xmax: 13553.27 ymax: -3701471
## projected CRS:  _MI_0
shp = shp %>% left_join(development, by = c("ID_cell" = "Zone"))

map  = tm_basemap(leaflet::providers$CartoDB)
map =  map + tm_shape(shp) +
  tm_polygons(col ="DevLandUse")
tmap_leaflet(map)
## Warning: The shape shp is invalid. See sf::st_is_valid
#The developable land is extracted from the field area_1 (a 1 % of this, as the word doc says).
# Unfortunately, this field seems to be a wrong computation of the area!!
shp$true_area = st_area(shp)
ggplot(shp, aes(x = area_1, y = DevLandUse)) + geom_point()

ggplot(shp, aes(x = as.numeric(true_area), y = DevLandUse)) + geom_point()

pp = read_csv(paste(folder, "microData/pp_2011.csv", sep = ""))
## Parsed with column specification:
## cols(
##   id = col_double(),
##   hhid = col_double(),
##   age = col_double(),
##   gender = col_double(),
##   relationShip = col_character(),
##   occupation = col_double(),
##   driversLicense = col_logical(),
##   workplace = col_double(),
##   income = col_double(),
##   race = col_character()
## )
hh = read_csv(paste(folder, "microData/hh_2011.csv", sep = ""))
## Parsed with column specification:
## cols(
##   id = col_double(),
##   dwelling = col_double(),
##   hhSize = col_double(),
##   autos = col_double()
## )
dd = read_csv(paste(folder, "microData/dd_2011.csv", sep = ""))
## Parsed with column specification:
## cols(
##   id = col_double(),
##   zone = col_double(),
##   type = col_character(),
##   hhID = col_double(),
##   bedrooms = col_double(),
##   quality = col_double(),
##   monthlyCost = col_double(),
##   yearBuilt = col_double(),
##   coordX = col_double(),
##   coordY = col_double()
## )
jj = read_csv(paste(folder, "microData/jj_2011.csv", sep = ""))
## Parsed with column specification:
## cols(
##   id = col_double(),
##   zone = col_double(),
##   personId = col_double(),
##   type = col_character(),
##   coordX = col_double(),
##   coordY = col_double(),
##   startTime = col_double(),
##   duration = col_double()
## )
pp %>% group_by(race) %>% summarize(count = n(), ave_income = mean(income))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
##   race       count ave_income
##   <chr>      <int>      <dbl>
## 1 BLACK    1494327       329.
## 2 COLOURED 1783284       527.
## 3 OTHER     143124      1177.
## 4 WHITE     641948      2644.
ggplot(pp, aes(x = income, color = race)) + stat_ecdf() + scale_x_log10() + xlab("Log(income)")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1671071 rows containing non-finite values (stat_ecdf).

#The income is defined differently by races. 

pp = pp %>% left_join(hh, by = c("hhid" = "id"))
pp = pp %>% left_join(dd, by = c("dwelling" = "id"))

pp_by_zone_and_race = pp %>% group_by(race,zone) %>% summarize(count = n()) %>% pivot_wider(names_from = "race", values_from = "count", values_fill = 0)
## `summarise()` regrouping output by 'race' (override with `.groups` argument)
shp = st_read(paste(folder, "input/zonesShapeFile/zones.shp",sep  =""))
## Reading layer `zones' from data source `C:\models\silo\capetown\cape_town_fabilut\silo\input\zonesShapeFile\zones.shp' using driver `ESRI Shapefile'
## Simple feature collection with 2980 features and 32 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -62801.14 ymin: -3795547 xmax: 13553.27 ymax: -3701471
## projected CRS:  _MI_0
shp = shp %>% left_join(pp_by_zone_and_race, by = c("ID_cell" = "zone"))
map  = tm_basemap(leaflet::providers$CartoDB)
map =  map + tm_shape(shp, "black") +
  tm_polygons(col ="BLACK", style = "quantile", convert2density = T)
map =  map + tm_shape(shp, "white") +
  tm_polygons(col ="WHITE", style = "quantile", convert2density = T)
tmap_leaflet(map)
## Warning: The shape black is invalid. See sf::st_is_valid
## Warning: The shape white is invalid. See sf::st_is_valid
#The pop density but race seems ok, at least, different and seggregated. Only two of four races are plotted!

jj = jj %>% mutate(is_vacant = if_else(personId==-1, "vacant", "taken"))

jj %>% group_by(type, is_vacant) %>% summarise(n())
## `summarise()` regrouping output by 'type' (override with `.groups` argument)
## # A tibble: 2 x 3
## # Groups:   type [1]
##   type  is_vacant   `n()`
##   <chr> <chr>       <int>
## 1 Empl  taken     1415971
## 2 Empl  vacant      62456
#There is only one job type! The vacant is set to a 4.2%. 

dd = dd %>% mutate(is_vacant = if_else(hhID==-1, "vacant", "taken"))
dd %>% group_by(type, is_vacant) %>% summarise(n(),
                                               ave_cost = mean(monthlyCost),
                                               ave_quality = mean(quality))
## `summarise()` regrouping output by 'type' (override with `.groups` argument)
## # A tibble: 5 x 5
## # Groups:   type [5]
##   type              is_vacant  `n()` ave_cost ave_quality
##   <chr>             <chr>      <int>    <dbl>       <dbl>
## 1 BACKYARD_INFORMAL taken      82832     217.        3.56
## 2 FORMAL            taken     726570     938.        3.98
## 3 INFORMAL          taken     163937     305.        2.88
## 4 MULTIFAMILY       taken     117834     886.        3.99
## 5 SEMIDETACHED      taken      93662     635.        3.97
#In the version of SP in gitlab there is no vacant dwellings!!
#Costs per type look reasonable


dd_by_zone = dd %>% group_by(type,zone) %>% summarize(count = n()) %>% pivot_wider(names_from = "type", values_from = "count", values_fill = 0)
## `summarise()` regrouping output by 'type' (override with `.groups` argument)
shp = st_read(paste(folder, "input/zonesShapeFile/zones.shp",sep  =""))
## Reading layer `zones' from data source `C:\models\silo\capetown\cape_town_fabilut\silo\input\zonesShapeFile\zones.shp' using driver `ESRI Shapefile'
## Simple feature collection with 2980 features and 32 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -62801.14 ymin: -3795547 xmax: 13553.27 ymax: -3701471
## projected CRS:  _MI_0
shp = shp %>% left_join(dd_by_zone, by = c("ID_cell" = "zone"))

map  = tm_basemap(leaflet::providers$CartoDB)
map =  map + tm_shape(shp, "multifamily") +
  tm_polygons(col ="MULTIFAMILY", style = "quantile", convert2density = T)
map =  map + tm_shape(shp, "backyard_informal") +
  tm_polygons(col ="BACKYARD_INFORMAL", style = "quantile", convert2density = T)
tmap_leaflet(map)
## Warning: The shape multifamily is invalid. See sf::st_is_valid
## Warning: The shape backyard_informal is invalid. See sf::st_is_valid
#Seems to be an spatial distribution of dd by type. I do not know if it is correct. Only two of 5 types are plotted.